- Read the CSV (hard‑coded folder)
library(tidyverse)
knitr::opts_chunk$set(
message = FALSE,
warning = FALSE,
fig.width = 14,
fig.height = 8,
out.width = "100%",
dpi = 150
)
# ---- User config ----
folder <- "C:/Users/dunnmk/Downloads/PD_Data/PD_Data" # CHANGE THIS PATH
time_ref <- 0
time_cmp <- 180
# Plot toggles
trans_combos_to_plot <- c("A_to_D", "C_to_B")
show_extra_fc_plots <- FALSE
dsb_for_cis_correlation <- "DSB1"
# Optional filters (set any to NULL to include everything)
filters <- list(
batch = NULL,
time_point = NULL,
DSB = NULL,
combo = NULL
)
# ---- Small helpers ----
pct <- function(num, den) dplyr::if_else(den > 0, 100 * num / den, NA_real_)
fc_ratio <- function(num, den, eps = 1e-6) {
# Avoid misleading 1.0 fold-changes when both numerator and denominator are truly 0.
# (Previously, eps/eps = 1.0.)
dplyr::case_when(
is.na(num) | is.na(den) ~ NA_real_,
abs(num) < eps & abs(den) < eps ~ NA_real_,
TRUE ~ (num + eps) / (den + eps)
)
}
apply_filters <- function(df, filters = list()) {
if (!is.null(filters$batch) && "batch" %in% names(df)) {
df <- df %>% dplyr::filter(batch %in% filters$batch)
}
if (!is.null(filters$time_point) && "time_point" %in% names(df)) {
df <- df %>% dplyr::filter(time_point %in% filters$time_point)
}
if (!is.null(filters$DSB) && "DSB" %in% names(df)) {
df <- df %>% dplyr::filter(DSB %in% filters$DSB)
}
if (!is.null(filters$combo) && "combo" %in% names(df)) {
df <- df %>% dplyr::filter(combo %in% filters$combo)
}
df
}
theme_allele_x <- function() {
theme_bw(base_size = 14) +
theme(
axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5, size = 10),
axis.text.y = element_text(size = 11),
strip.text = element_text(size = 13),
plot.title = element_text(size = 16, face = "bold"),
plot.margin = margin(5.5, 15, 5.5, 5.5)
)
}
stopifnot(dir.exists(folder))
files <- list.files(folder, pattern = "_summary\\.csv$", full.names = TRUE)
stopifnot(length(files) > 0)
raw <- purrr::map_dfr(files, readr::read_csv, show_col_types = FALSE)
# Standardize time column name
if ("time" %in% names(raw) && !"time_point" %in% names(raw)) {
raw <- raw %>% dplyr::rename(time_point = time)
}
raw <- raw %>% dplyr::mutate(time_point = as.numeric(time_point))
filtered <- apply_filters(raw, filters)
filtered |> head(10) |> knitr::kable(caption = "Combined CSV preview (after optional filters)")
Combined CSV preview (after optional filters)
| CIS_DSB1_FULL_CHRIII_L02 |
CIS |
DSB1 |
A_to_B |
FULL |
CHRIII_L02 |
51283 |
batch6 |
0 |
| CIS_DSB1_FULL_CHRIII_L03 |
CIS |
DSB1 |
A_to_B |
FULL |
CHRIII_L03 |
38174 |
batch6 |
0 |
| CIS_DSB1_FULL_CHRIII_L04 |
CIS |
DSB1 |
A_to_B |
FULL |
CHRIII_L04 |
228463 |
batch6 |
0 |
| CIS_DSB1_FULL_CHRIV_L01 |
CIS |
DSB1 |
A_to_B |
FULL |
CHRIV_L01 |
49541 |
batch6 |
0 |
| CIS_DSB1_FULL_CHRIV_L03 |
CIS |
DSB1 |
A_to_B |
FULL |
CHRIV_L03 |
32857 |
batch6 |
0 |
| CIS_DSB1_FULL_CHRIV_L06_2 |
CIS |
DSB1 |
A_to_B |
FULL |
CHRIV_L06_2 |
54088 |
batch6 |
0 |
| CIS_DSB1_FULL_CHRIV_L10_2 |
CIS |
DSB1 |
A_to_B |
FULL |
CHRIV_L10_2 |
39551 |
batch6 |
0 |
| CIS_DSB1_FULL_CHRIV_L12_2 |
CIS |
DSB1 |
A_to_B |
FULL |
CHRIV_L12_2 |
44777 |
batch6 |
0 |
| CIS_DSB1_FULL_CHRIV_L13 |
CIS |
DSB1 |
A_to_B |
FULL |
CHRIV_L13 |
29708 |
batch6 |
0 |
| CIS_DSB1_FULL_CHRIV_L15_2 |
CIS |
DSB1 |
A_to_B |
FULL |
CHRIV_L15_2 |
51410 |
batch6 |
0 |
- Aggregate counts and percentages
Aggregate counts from DSB monitoring pulldown experiment
summarize_counts_pct <- function(df) {
df %>%
summarise(
Total_Counts = sum(count),
Cis_Counts = sum(count[cis_trans == "CIS"]),
Trans_Counts = sum(count[cis_trans == "TRANS"]),
A_to_B_Counts = sum(count[combo == "A_to_B"]),
C_to_D_Counts = sum(count[combo == "C_to_D"]),
A_to_D_Counts = sum(count[combo == "A_to_D"]),
C_to_B_Counts = sum(count[combo == "C_to_B"]),
Percent_Cis = pct(Cis_Counts, Total_Counts),
Percent_Trans = pct(Trans_Counts, Total_Counts),
Percent_A_to_B = pct(A_to_B_Counts, Total_Counts),
Percent_C_to_D = pct(C_to_D_Counts, Total_Counts),
Percent_A_to_D = pct(A_to_D_Counts, Total_Counts),
Percent_C_to_B = pct(C_to_B_Counts, Total_Counts),
.groups = "drop"
)
}
agg_counts <- filtered %>%
group_by(batch, time_point, DSB) %>%
summarize_counts_pct()
agg_counts |> knitr::kable(caption = "Aggregated counts and percentages")
Aggregated counts and percentages
| batch6 |
0 |
DSB1 |
1238149 |
1238149 |
0 |
1238149 |
0 |
0 |
0 |
100 |
0 |
100 |
0 |
0.00000 |
0.00000 |
| batch6 |
0 |
DSB2 |
1777638 |
1777638 |
0 |
0 |
1777638 |
0 |
0 |
100 |
0 |
0 |
100 |
0.00000 |
0.00000 |
| batch6 |
0 |
TRANS |
7447 |
0 |
7447 |
0 |
0 |
4588 |
2859 |
0 |
100 |
0 |
0 |
61.60870 |
38.39130 |
| batch6 |
180 |
DSB1 |
1549523 |
1549523 |
0 |
1549523 |
0 |
0 |
0 |
100 |
0 |
100 |
0 |
0.00000 |
0.00000 |
| batch6 |
180 |
DSB2 |
1759385 |
1759385 |
0 |
0 |
1759385 |
0 |
0 |
100 |
0 |
0 |
100 |
0.00000 |
0.00000 |
| batch6 |
180 |
TRANS |
230252 |
0 |
230252 |
0 |
0 |
162974 |
67278 |
0 |
100 |
0 |
0 |
70.78071 |
29.21929 |
| batch8 |
0 |
DSB1 |
483724 |
483724 |
0 |
483724 |
0 |
0 |
0 |
100 |
0 |
100 |
0 |
0.00000 |
0.00000 |
| batch8 |
0 |
DSB2 |
1057475 |
1057475 |
0 |
0 |
1057475 |
0 |
0 |
100 |
0 |
0 |
100 |
0.00000 |
0.00000 |
| batch8 |
0 |
TRANS |
1284 |
0 |
1284 |
0 |
0 |
749 |
535 |
0 |
100 |
0 |
0 |
58.33333 |
41.66667 |
| batch8 |
180 |
DSB1 |
1511971 |
1511971 |
0 |
1511971 |
0 |
0 |
0 |
100 |
0 |
100 |
0 |
0.00000 |
0.00000 |
| batch8 |
180 |
DSB2 |
1726168 |
1726168 |
0 |
0 |
1726168 |
0 |
0 |
100 |
0 |
0 |
100 |
0.00000 |
0.00000 |
| batch8 |
180 |
TRANS |
254293 |
0 |
254293 |
0 |
0 |
174852 |
79441 |
0 |
100 |
0 |
0 |
68.76005 |
31.23995 |
2.5. Batch-level QC plots (counts and CIS/TRANS composition)
These plots are intended to sanity-check overall depth and
composition before allele-level cis/trans normalization.
# Re-use the same aggregation logic as the table above
summarize_qc_by_batch <- function(df) {
df %>%
group_by(batch, time_point, DSB) %>%
summarize_counts_pct()
}
add_allbatch_shares <- function(agg_qc) {
totals <- agg_qc %>%
group_by(time_point) %>%
summarise(
Total_AllBatches = sum(Total_Counts),
Total_Cis_AllBatches = sum(Cis_Counts),
Total_Trans_AllBatches = sum(Trans_Counts),
.groups = "drop"
)
agg_qc %>%
left_join(totals, by = "time_point") %>%
mutate(
Percent_Total_of_AllBatches = pct(Total_Counts, Total_AllBatches),
Percent_Cis_of_AllBatches = pct(Cis_Counts, Total_Cis_AllBatches),
Percent_Trans_of_AllBatches = pct(Trans_Counts, Total_Trans_AllBatches)
)
}
# Main QC table
agg_qc <- summarize_qc_by_batch(filtered)
# Percent-of-total across ALL batches (within each time point).
# These percentages are intended to SUM TO 100 across batches (and DSB bars) per time point.
agg_qc_allbatches <- add_allbatch_shares(agg_qc)
# ---- 1) Total counts by batch ----
p_total_counts <- ggplot(agg_qc, aes(x = batch, y = Total_Counts, fill = DSB)) +
geom_col(position = position_dodge(width = 0.8), width = 0.75) +
geom_text(
aes(label = scales::comma(Total_Counts)),
position = position_dodge(width = 0.8),
vjust = -0.25,
size = 3.4
) +
facet_wrap(~ time_point, scales = "free_y") +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = "Total counts by batch (faceted by time)",
x = "Batch",
y = "Total counts"
)
if (nrow(agg_qc) > 0) {
print(p_total_counts)
}

# Optional: total-count share across ALL batches (sums to 100 within each time point)
p_total_counts_share <- ggplot(agg_qc_allbatches, aes(x = batch, y = Percent_Total_of_AllBatches, fill = DSB)) +
geom_col(position = position_dodge(width = 0.8), width = 0.75) +
geom_text(
aes(label = if_else(is.na(Percent_Total_of_AllBatches), NA_character_, paste0(round(Percent_Total_of_AllBatches, 1), "%"))),
position = position_dodge(width = 0.8),
vjust = -0.25,
size = 3.2
) +
facet_wrap(~ time_point) +
scale_y_continuous(limits = c(0, 100)) +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = "Total-count share by batch (of ALL batches) — bars sum to 100",
x = "Batch",
y = "% of total counts (across all batches)",
fill = "DSB"
)
if (nrow(agg_qc_allbatches) > 0 && any(!is.na(agg_qc_allbatches$Percent_Total_of_AllBatches))) {
print(p_total_counts_share)
}

# ---- 2) Counts by category within each batch (split CIS-only vs TRANS-only) ----
cis_counts_long <- agg_qc %>%
select(
batch, time_point, DSB,
Cis_Counts,
A_to_B_Counts, C_to_D_Counts
) %>%
pivot_longer(
cols = -c(batch, time_point, DSB),
names_to = "Metric",
values_to = "Counts"
) %>%
mutate(
Metric = factor(
Metric,
levels = c("Cis_Counts", "A_to_B_Counts", "C_to_D_Counts"),
labels = c("CIS (total)", "A→B (cis)", "C→D (cis)")
)
)
p_cis_counts_by_type <- ggplot(cis_counts_long, aes(x = batch, y = Counts, fill = DSB)) +
geom_col(position = position_dodge(width = 0.85), width = 0.8) +
geom_text(
aes(label = scales::comma(Counts)),
position = position_dodge(width = 0.85),
vjust = -0.25,
size = 3.0
) +
facet_grid(Metric ~ time_point, scales = "free_y") +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = "CIS counts by category per batch — all DSBs on one graph",
x = "Batch",
y = "Counts",
fill = "DSB"
)
if (nrow(cis_counts_long) > 0) {
print(p_cis_counts_by_type)
}

trans_counts_long <- agg_qc %>%
select(
batch, time_point, DSB,
Trans_Counts,
A_to_D_Counts, C_to_B_Counts
) %>%
pivot_longer(
cols = -c(batch, time_point, DSB),
names_to = "Metric",
values_to = "Counts"
) %>%
mutate(
Metric = factor(
Metric,
levels = c("Trans_Counts", "A_to_D_Counts", "C_to_B_Counts"),
labels = c("TRANS (total)", "A→D (trans)", "C→B (trans)")
)
)
p_trans_counts_by_type <- ggplot(trans_counts_long, aes(x = batch, y = Counts, fill = DSB)) +
geom_col(position = position_dodge(width = 0.85), width = 0.8) +
geom_text(
aes(label = scales::comma(Counts)),
position = position_dodge(width = 0.85),
vjust = -0.25,
size = 3.0
) +
facet_grid(Metric ~ time_point, scales = "free_y") +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = "TRANS counts by category per batch — all DSBs on one graph",
x = "Batch",
y = "Counts",
fill = "DSB"
)
if (nrow(trans_counts_long) > 0) {
print(p_trans_counts_by_type)
}

# ---- 3) Percent CIS and Percent TRANS per group (split) ----
pct_cis_only <- agg_qc_allbatches %>%
select(batch, time_point, DSB, Percent_Cis_of_AllBatches) %>%
mutate(Percent_Cis_Label = if_else(is.na(Percent_Cis_of_AllBatches), NA_character_, paste0(round(Percent_Cis_of_AllBatches, 1), "%")))
p_pct_cis <- ggplot(pct_cis_only, aes(x = batch, y = Percent_Cis_of_AllBatches, fill = DSB)) +
geom_col(position = position_dodge(width = 0.85), width = 0.8) +
geom_text(aes(label = Percent_Cis_Label), position = position_dodge(width = 0.85), vjust = -0.25, size = 3.2) +
facet_wrap(~ time_point) +
scale_y_continuous(limits = c(0, 100)) +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = "CIS share by batch (of ALL batches) — bars sum to 100",
x = "Batch",
y = "% of CIS counts (across all batches)",
fill = "DSB"
)
if (nrow(pct_cis_only) > 0 && any(!is.na(pct_cis_only$Percent_Cis_of_AllBatches))) {
print(p_pct_cis)
}

pct_trans_only <- agg_qc_allbatches %>%
select(batch, time_point, DSB, Percent_Trans_of_AllBatches) %>%
mutate(Percent_Trans_Label = if_else(is.na(Percent_Trans_of_AllBatches), NA_character_, paste0(round(Percent_Trans_of_AllBatches, 1), "%")))
p_pct_trans <- ggplot(pct_trans_only, aes(x = batch, y = Percent_Trans_of_AllBatches, fill = DSB)) +
geom_col(position = position_dodge(width = 0.85), width = 0.8) +
geom_text(aes(label = Percent_Trans_Label), position = position_dodge(width = 0.85), vjust = -0.25, size = 3.2) +
facet_wrap(~ time_point) +
scale_y_continuous(limits = c(0, 100)) +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = "TRANS share by batch (of ALL batches) — bars sum to 100",
x = "Batch",
y = "% of TRANS counts (across all batches)",
fill = "DSB"
)
if (nrow(pct_trans_only) > 0 && any(!is.na(pct_trans_only$Percent_Trans_of_AllBatches))) {
print(p_pct_trans)
}

# ---- 4) CIS/TRANS combo share (of ALL batches) ----
# These plots are computed as % of ALL CIS (or ALL TRANS) counts across batches,
# so the bars/segments within each time point SUM TO 100.
summarize_combo_counts <- function(df) {
df %>%
group_by(batch, time_point, DSB, cis_trans, combo) %>%
summarise(Combo_Counts = sum(count), .groups = "drop")
}
combo_counts <- summarize_combo_counts(filtered) %>%
mutate(DSB = factor(DSB, levels = c("DSB1", "DSB2", "TRANS")))
# CIS: only A_to_B and C_to_D
cis_combo_counts <- combo_counts %>%
filter(cis_trans == "CIS", combo %in% c("A_to_B", "C_to_D"))
cis_totals <- cis_combo_counts %>%
group_by(time_point) %>%
summarise(Total_Cis_2Combo_AllBatches = sum(Combo_Counts), .groups = "drop")
cis_combo_share <- cis_combo_counts %>%
left_join(cis_totals, by = "time_point") %>%
mutate(
Percent = pct(Combo_Counts, Total_Cis_2Combo_AllBatches),
Combo = dplyr::recode(combo, A_to_B = "A→B (CIS)", C_to_D = "C→D (CIS)")
)
p_cis_combo_share <- ggplot(
cis_combo_share,
aes(x = interaction(batch, DSB, sep = " | "), y = Percent, fill = Combo)
) +
geom_col(width = 0.8) +
geom_text(
aes(label = if_else(is.na(Percent) | Percent <= 0, NA_character_, paste0(round(Percent, 1), "%"))),
position = position_stack(vjust = 0.5),
size = 3.0
) +
facet_wrap(~ time_point) +
scale_x_discrete(labels = function(x) sub(" \\|.*$", "", x)) +
scale_y_continuous(limits = c(0, 100)) +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = "CIS combo share (A→B + C→D) of ALL batches — bars sum to 100",
x = "Batch",
y = "% of CIS counts (across all batches)",
fill = ""
)
if (nrow(cis_combo_share) > 0 && any(!is.na(cis_combo_share$Percent))) {
print(p_cis_combo_share)
}

# TRANS: only A_to_D and C_to_B
trans_combo_counts <- combo_counts %>%
filter(cis_trans == "TRANS", combo %in% c("A_to_D", "C_to_B"))
trans_totals <- trans_combo_counts %>%
group_by(time_point) %>%
summarise(Total_Trans_2Combo_AllBatches = sum(Combo_Counts), .groups = "drop")
trans_combo_share <- trans_combo_counts %>%
left_join(trans_totals, by = "time_point") %>%
mutate(
Percent = pct(Combo_Counts, Total_Trans_2Combo_AllBatches),
Combo = dplyr::recode(combo, A_to_D = "A→D (TRANS)", C_to_B = "C→B (TRANS)")
)
p_trans_combo_share <- ggplot(
trans_combo_share,
aes(x = interaction(batch, DSB, sep = " | "), y = Percent, fill = Combo)
) +
geom_col(width = 0.8) +
geom_text(
aes(label = if_else(is.na(Percent) | Percent <= 0, NA_character_, paste0(round(Percent, 1), "%"))),
position = position_stack(vjust = 0.5),
size = 3.0
) +
facet_wrap(~ time_point) +
scale_x_discrete(labels = function(x) sub(" \\|.*$", "", x)) +
scale_y_continuous(limits = c(0, 100)) +
theme_bw(base_size = 14) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(
title = "TRANS combo share (A→D + C→B) of ALL batches — bars sum to 100",
x = "Batch",
y = "% of TRANS counts (across all batches)",
fill = ""
)
if (nrow(trans_combo_share) > 0 && any(!is.na(trans_combo_share$Percent))) {
print(p_trans_combo_share)
}
*** 3. Cis/trans normalization — per allele
location_pct_by_allele <- function(df) {
by_allele <- df %>%
group_by(batch, time_point, DSB, allele) %>%
summarise(
Cis_Location_Counts = sum(count[cis_trans == "CIS"]),
Trans_Location_Counts = sum(count[cis_trans == "TRANS"]),
.groups = "drop"
)
totals <- by_allele %>%
group_by(batch, time_point, DSB) %>%
summarise(
Total_Cis_Location_Counts = sum(Cis_Location_Counts),
Total_Trans_Location_Counts = sum(Trans_Location_Counts),
.groups = "drop"
)
by_allele %>%
left_join(totals, by = c("batch", "time_point", "DSB")) %>%
mutate(
Total_Group_Counts = Total_Cis_Location_Counts + Total_Trans_Location_Counts,
# Percent definitions requested:
# CIS% = cis_counts / total_counts (within the group)
# TRANS% = trans_counts / total_counts (within the group)
Percent_Cis_of_GroupTotal = pct(Cis_Location_Counts, Total_Group_Counts),
Percent_Trans_of_GroupTotal = pct(Trans_Location_Counts, Total_Group_Counts),
Percent_Location_in_Cis = pct(Cis_Location_Counts, Total_Cis_Location_Counts),
Percent_Location_in_Trans = pct(Trans_Location_Counts, Total_Trans_Location_Counts)
)
}
dat_norm <- location_pct_by_allele(filtered)
dat_norm |> head(10) |> knitr::kable(caption = "Data after cis/trans normalization")
Data after cis/trans normalization
| batch6 |
0 |
DSB1 |
CHRIII_L02 |
52550 |
0 |
1238149 |
0 |
1238149 |
4.244239 |
0 |
4.244239 |
NA |
| batch6 |
0 |
DSB1 |
CHRIII_L03 |
39051 |
0 |
1238149 |
0 |
1238149 |
3.153982 |
0 |
3.153982 |
NA |
| batch6 |
0 |
DSB1 |
CHRIII_L04 |
234459 |
0 |
1238149 |
0 |
1238149 |
18.936251 |
0 |
18.936251 |
NA |
| batch6 |
0 |
DSB1 |
CHRIV_L01 |
50897 |
0 |
1238149 |
0 |
1238149 |
4.110733 |
0 |
4.110733 |
NA |
| batch6 |
0 |
DSB1 |
CHRIV_L03 |
33625 |
0 |
1238149 |
0 |
1238149 |
2.715747 |
0 |
2.715747 |
NA |
| batch6 |
0 |
DSB1 |
CHRIV_L06_2 |
55461 |
0 |
1238149 |
0 |
1238149 |
4.479348 |
0 |
4.479348 |
NA |
| batch6 |
0 |
DSB1 |
CHRIV_L10_2 |
40604 |
0 |
1238149 |
0 |
1238149 |
3.279411 |
0 |
3.279411 |
NA |
| batch6 |
0 |
DSB1 |
CHRIV_L12_2 |
45904 |
0 |
1238149 |
0 |
1238149 |
3.707470 |
0 |
3.707470 |
NA |
| batch6 |
0 |
DSB1 |
CHRIV_L13 |
30438 |
0 |
1238149 |
0 |
1238149 |
2.458347 |
0 |
2.458347 |
NA |
| batch6 |
0 |
DSB1 |
CHRIV_L15_2 |
52669 |
0 |
1238149 |
0 |
1238149 |
4.253850 |
0 |
4.253850 |
NA |
- Allele frequency (CIS + TRANS)
allele_frequency <- function(df) {
by_allele <- df %>%
group_by(batch, time_point, DSB, allele) %>%
summarise(
Cis_Counts = sum(count[cis_trans == "CIS"]),
Trans_Counts = sum(count[cis_trans == "TRANS"]),
Allele_Total = Cis_Counts + Trans_Counts,
.groups = "drop"
)
totals <- by_allele %>%
group_by(batch, time_point, DSB) %>%
summarise(
Total_All = sum(Allele_Total),
.groups = "drop"
)
by_allele %>%
left_join(totals, by = c("batch", "time_point", "DSB")) %>%
mutate(Allele_Frequency = dplyr::if_else(Total_All > 0, Allele_Total / Total_All, NA_real_))
}
dat_allele_freq <- allele_frequency(filtered)
dat_allele_freq |> head(10) |> knitr::kable(caption = "Allele frequencies")
Allele frequencies
| batch6 |
0 |
DSB1 |
CHRIII_L02 |
52550 |
0 |
52550 |
1238149 |
0.0424424 |
| batch6 |
0 |
DSB1 |
CHRIII_L03 |
39051 |
0 |
39051 |
1238149 |
0.0315398 |
| batch6 |
0 |
DSB1 |
CHRIII_L04 |
234459 |
0 |
234459 |
1238149 |
0.1893625 |
| batch6 |
0 |
DSB1 |
CHRIV_L01 |
50897 |
0 |
50897 |
1238149 |
0.0411073 |
| batch6 |
0 |
DSB1 |
CHRIV_L03 |
33625 |
0 |
33625 |
1238149 |
0.0271575 |
| batch6 |
0 |
DSB1 |
CHRIV_L06_2 |
55461 |
0 |
55461 |
1238149 |
0.0447935 |
| batch6 |
0 |
DSB1 |
CHRIV_L10_2 |
40604 |
0 |
40604 |
1238149 |
0.0327941 |
| batch6 |
0 |
DSB1 |
CHRIV_L12_2 |
45904 |
0 |
45904 |
1238149 |
0.0370747 |
| batch6 |
0 |
DSB1 |
CHRIV_L13 |
30438 |
0 |
30438 |
1238149 |
0.0245835 |
| batch6 |
0 |
DSB1 |
CHRIV_L15_2 |
52669 |
0 |
52669 |
1238149 |
0.0425385 |
- Fold change calculations (time_cmp / time_ref)
fold_change_table <- function(df_norm, value_col, time_ref, time_cmp) {
df_norm %>%
filter(time_point %in% c(time_ref, time_cmp)) %>%
select(batch, time_point, DSB, allele, !!rlang::sym(value_col)) %>%
pivot_wider(
names_from = time_point,
values_from = !!rlang::sym(value_col),
values_fill = 0
) %>%
mutate(
FoldChange = fc_ratio(.data[[as.character(time_cmp)]], .data[[as.character(time_ref)]]),
Log2FC = log2(FoldChange)
)
}
# Fold change based on allele frequency (CIS + TRANS combined).
dat_fc_allele_freq <- fold_change_table(dat_allele_freq, "Allele_Frequency", time_ref, time_cmp) %>%
rename(
FoldChange_AlleleFreq_vs_ref = FoldChange,
Log2FC_AlleleFreq_vs_ref = Log2FC
)
dat_fc_cis <- fold_change_table(dat_norm, "Percent_Cis_of_GroupTotal", time_ref, time_cmp) %>%
rename(
FoldChange_Cis_vs_ref = FoldChange,
Log2FC_Cis_vs_ref = Log2FC
)
dat_fc_trans <- fold_change_table(dat_norm, "Percent_Trans_of_GroupTotal", time_ref, time_cmp) %>%
rename(
FoldChange_Trans_vs_ref = FoldChange,
Log2FC_Trans_vs_ref = Log2FC
)
dat_fc_allele_freq %>%
mutate(
FoldChange_AlleleFreq_vs_ref = round(FoldChange_AlleleFreq_vs_ref, 6),
Log2FC_AlleleFreq_vs_ref = round(Log2FC_AlleleFreq_vs_ref, 6)
) %>%
head(10) %>%
knitr::kable(caption = "Fold change (time_cmp/time_ref) based on allele frequency")
Fold change (time_cmp/time_ref) based on allele
frequency
| batch6 |
DSB1 |
CHRIII_L02 |
0.0424424 |
0.0147833 |
0.348329 |
-1.521478 |
| batch6 |
DSB1 |
CHRIII_L03 |
0.0315398 |
0.0366422 |
1.161772 |
0.216327 |
| batch6 |
DSB1 |
CHRIII_L04 |
0.1893625 |
0.1869349 |
0.987180 |
-0.018614 |
| batch6 |
DSB1 |
CHRIV_L01 |
0.0411073 |
0.0374831 |
0.911838 |
-0.133150 |
| batch6 |
DSB1 |
CHRIV_L03 |
0.0271575 |
0.0100328 |
0.369452 |
-1.436539 |
| batch6 |
DSB1 |
CHRIV_L06_2 |
0.0447935 |
0.0453636 |
1.012728 |
0.018247 |
| batch6 |
DSB1 |
CHRIV_L10_2 |
0.0327941 |
0.0336897 |
1.027309 |
0.038870 |
| batch6 |
DSB1 |
CHRIV_L12_2 |
0.0370747 |
0.0496959 |
1.340418 |
0.422683 |
| batch6 |
DSB1 |
CHRIV_L13 |
0.0245835 |
0.0267431 |
1.087844 |
0.121472 |
| batch6 |
DSB1 |
CHRIV_L15_2 |
0.0425385 |
0.0526704 |
1.238176 |
0.308217 |
# Make the later plots larger/more readable.
# (This updates defaults for subsequent chunks only.)
knitr::opts_chunk$set(
fig.width = 16,
fig.height = 9,
out.width = "100%",
dpi = 160
)
- CIS and TRANS bar plots
plot_cis_percent_by_batch <- function(dat_norm) {
df <- dat_norm %>%
filter(!is.na(Percent_Cis_of_GroupTotal)) %>%
mutate(
time_point = factor(time_point, levels = sort(unique(time_point)))
)
batches <- df %>% distinct(batch) %>% arrange(batch) %>% pull(batch)
for (b in batches) {
df_b <- df %>% filter(batch == b)
dsbs <- df_b %>%
distinct(DSB) %>%
arrange(factor(DSB, levels = c("DSB1", "DSB2", "TRANS"))) %>%
pull(DSB)
for (d in dsbs) {
df_bd <- df_b %>% filter(DSB == d)
if (nrow(df_bd) == 0 || all(is.na(df_bd$Percent_Cis_of_GroupTotal))) next
p <- ggplot(df_bd, aes(x = allele, y = Percent_Cis_of_GroupTotal)) +
geom_col(width = 0.85, fill = "#2C7FB8") +
geom_text(
aes(label = if_else(is.na(Percent_Cis_of_GroupTotal), NA_character_, paste0(round(Percent_Cis_of_GroupTotal, 1), "%"))),
vjust = -0.25,
size = 3.0,
angle = 90
) +
facet_wrap(~ time_point, scales = "free_x") +
scale_y_continuous(limits = c(0, 100)) +
theme_allele_x() +
labs(
title = paste0("CIS% by allele (cis_counts / total_counts) | Batch: ", b, " | ", d),
x = "Allele",
y = "CIS% of total counts (within group)"
)
print(p)
}
}
}
plot_cis_percent_by_batch(dat_norm)






plot_trans_percent_by_batch <- function(dat_norm) {
df <- dat_norm %>%
filter(!is.na(Percent_Trans_of_GroupTotal)) %>%
mutate(
time_point = factor(time_point, levels = sort(unique(time_point)))
)
batches <- df %>% distinct(batch) %>% arrange(batch) %>% pull(batch)
for (b in batches) {
df_b <- df %>% filter(batch == b)
dsbs <- df_b %>%
distinct(DSB) %>%
arrange(factor(DSB, levels = c("DSB1", "DSB2", "TRANS"))) %>%
pull(DSB)
for (d in dsbs) {
df_bd <- df_b %>% filter(DSB == d)
if (nrow(df_bd) == 0 || all(is.na(df_bd$Percent_Trans_of_GroupTotal))) next
p <- ggplot(df_bd, aes(x = allele, y = Percent_Trans_of_GroupTotal)) +
geom_col(width = 0.85, fill = "#F03B20") +
geom_text(
aes(label = if_else(is.na(Percent_Trans_of_GroupTotal), NA_character_, paste0(round(Percent_Trans_of_GroupTotal, 1), "%"))),
vjust = -0.25,
size = 3.0,
angle = 90
) +
facet_wrap(~ time_point, scales = "free_x") +
scale_y_continuous(limits = c(0, 100)) +
theme_allele_x() +
labs(
title = paste0("TRANS% by allele (trans_counts / total_counts) | Batch: ", b, " | ", d),
x = "Allele",
y = "TRANS% of total counts (within group)"
)
print(p)
}
}
}
plot_trans_percent_by_batch(dat_norm)






# Optional: within-TRANS combo breakdowns with T0/T120 on the SAME graph.
# (Bars are colored by time.)
plot_trans_combo_by_time <- function(dat, combo_name, time_levels = c(0, 120)) {
df_counts <- dat %>%
filter(combo == combo_name, time_point %in% time_levels) %>%
group_by(batch, time_point, DSB, allele) %>%
summarise(Combo_Counts = sum(count), .groups = "drop")
if (nrow(df_counts) == 0) return(NULL)
totals <- df_counts %>%
group_by(batch, time_point, DSB) %>%
summarise(Total_Combo = sum(Combo_Counts), .groups = "drop")
df_plot <- df_counts %>%
left_join(totals, by = c("batch", "time_point", "DSB")) %>%
mutate(
Percent_in_Combo = pct(Combo_Counts, Total_Combo),
time_point = factor(time_point, levels = time_levels, labels = paste0("T", time_levels))
)
if (nrow(df_plot) == 0 || all(is.na(df_plot$Percent_in_Combo))) return(NULL)
ggplot(df_plot, aes(x = allele, y = Percent_in_Combo, fill = time_point)) +
geom_col(position = position_dodge(width = 0.9), width = 0.85) +
geom_text(
aes(label = if_else(is.na(Percent_in_Combo), NA_character_, paste0(round(Percent_in_Combo, 1), "%"))),
position = position_dodge(width = 0.9),
vjust = -0.25,
size = 2.0,
angle = 90
) +
facet_grid(batch + DSB ~ ., scales = "free_x") +
scale_y_continuous(limits = c(0, 100)) +
theme_allele_x() +
labs(
title = paste0("Percent of ", combo_name, " counts by allele (T0 vs T120)"),
x = "Allele",
y = "% within combo (within group)",
fill = "Time"
)
}
for (cmb in trans_combos_to_plot) {
p <- plot_trans_combo_by_time(filtered, cmb, time_levels = c(0, 120))
if (!is.null(p)) print(p)
}


- Fold change and allele frequency plots
plot_foldchange_by_batch <- function(df, fc_col, title_prefix) {
df <- df %>% mutate(DSB = factor(DSB, levels = c("DSB1", "DSB2", "TRANS")))
batches <- df %>% distinct(batch) %>% arrange(batch) %>% pull(batch)
for (b in batches) {
df_b <- df %>% filter(batch == b)
dsbs <- df_b %>%
distinct(DSB) %>%
arrange(factor(DSB, levels = c("DSB1", "DSB2", "TRANS"))) %>%
pull(DSB)
for (d in dsbs) {
df_bd <- df_b %>% filter(DSB == d)
if (nrow(df_bd) == 0 || all(is.na(df_bd[[fc_col]]))) next
p <- ggplot(df_bd, aes(x = allele, y = .data[[fc_col]])) +
geom_col(width = 0.85, fill = "#636363") +
geom_hline(yintercept = 2, linetype = "dotted", color = "red", size = 0.7) +
geom_text(
aes(label = if_else(is.na(.data[[fc_col]]), NA_character_, sprintf("%.4f", .data[[fc_col]]))),
vjust = -0.25,
size = 2.8,
angle = 90
) +
theme_allele_x() +
labs(
title = paste0(title_prefix, " | Batch: ", b, " | ", d),
x = "Allele",
y = "Fold change"
)
print(p)
}
}
}
plot_allele_frequency_by_batch <- function(dat_allele_freq) {
df <- dat_allele_freq %>% mutate(DSB = factor(DSB, levels = c("DSB1", "DSB2", "TRANS")))
batches <- df %>% distinct(batch) %>% arrange(batch) %>% pull(batch)
for (b in batches) {
df_b <- df %>% filter(batch == b)
dsbs <- df_b %>%
distinct(DSB) %>%
arrange(factor(DSB, levels = c("DSB1", "DSB2", "TRANS"))) %>%
pull(DSB)
for (d in dsbs) {
df_bd <- df_b %>% filter(DSB == d)
if (nrow(df_bd) == 0 || all(is.na(df_bd$Allele_Frequency))) next
p <- ggplot(df_bd, aes(x = allele, y = Allele_Frequency)) +
geom_col(width = 0.85, fill = "#31A354") +
geom_text(
aes(label = if_else(is.na(Allele_Frequency), NA_character_, sprintf("%.3f", Allele_Frequency))),
vjust = -0.25,
size = 2.8,
angle = 90
) +
facet_wrap(~ time_point, scales = "free_x") +
theme_allele_x() +
labs(
title = paste0("Allele Frequency (CIS + TRANS) | Batch: ", b, " | ", d),
x = "Allele",
y = "Allele Frequency"
)
print(p)
}
}
}
# Generate updated plots
plot_foldchange_by_batch(
dat_fc_cis,
"FoldChange_Cis_vs_ref",
paste0("Fold change (", time_cmp, "/", time_ref, ") CIS by allele")
)




plot_foldchange_by_batch(
dat_fc_trans,
"FoldChange_Trans_vs_ref",
paste0("Fold change (", time_cmp, "/", time_ref, ") TRANS by allele")
)


plot_foldchange_by_batch(
dat_fc_allele_freq,
"FoldChange_AlleleFreq_vs_ref",
paste0("Fold change (", time_cmp, "/", time_ref, ") Allele frequency (CIS + TRANS) by allele")
)






plot_allele_frequency_by_batch(dat_allele_freq)






if (isTRUE(show_extra_fc_plots)) {
if (nrow(dat_fc_cis) == 0 || nrow(dat_fc_trans) == 0) {
message("Skipping extra FC plots: no data in dat_fc_cis/dat_fc_trans")
} else {
p1 <- ggplot(dat_fc_cis, aes(x = allele, y = FoldChange_Cis_vs_ref, fill = DSB)) +
geom_col(width = 0.9, position = "dodge") +
geom_hline(yintercept = 2, linetype = "dotted", color = "red", size = 0.7) +
geom_text(aes(label = round(FoldChange_Cis_vs_ref, 2)),
position = position_dodge(width = 0.9),
vjust = -0.3, size = 2.5, angle = 90) +
facet_wrap(~ batch, scales = "free_x") +
theme_allele_x() +
labs(
title = paste0("Fold change (", time_cmp, "/", time_ref, ") CIS by allele"),
x = "Allele", y = "Fold change"
)
p2 <- ggplot(dat_fc_trans, aes(x = allele, y = FoldChange_Trans_vs_ref, fill = DSB)) +
geom_col(width = 0.9, position = "dodge") +
geom_hline(yintercept = 2, linetype = "dotted", color = "red", size = 0.7) +
geom_text(aes(label = round(FoldChange_Trans_vs_ref, 2)),
position = position_dodge(width = 0.9),
vjust = -0.3, size = 2.5, angle = 90) +
facet_wrap(~ batch, scales = "free_x") +
theme_allele_x() +
labs(
title = paste0("Fold change (", time_cmp, "/", time_ref, ") TRANS by allele"),
x = "Allele", y = "Fold change"
)
print(p1)
print(p2)
}
}
- Correlation: log2FC vs Allele Frequency
library(ggrepel)
dat_wide <- dat_norm %>%
filter(time_point %in% c(time_ref, time_cmp)) %>%
select(batch, DSB, allele, time_point,
Percent_Cis_of_GroupTotal, Percent_Trans_of_GroupTotal) %>%
pivot_wider(
names_from = time_point,
values_from = c(Percent_Cis_of_GroupTotal, Percent_Trans_of_GroupTotal),
values_fill = 0
) %>%
mutate(
log2FC_CIS = log2(fc_ratio(.data[[paste0("Percent_Cis_of_GroupTotal_", time_cmp)]],
.data[[paste0("Percent_Cis_of_GroupTotal_", time_ref)]])),
log2FC_TRANS = log2(fc_ratio(.data[[paste0("Percent_Trans_of_GroupTotal_", time_cmp)]],
.data[[paste0("Percent_Trans_of_GroupTotal_", time_ref)]]))
)
dat_fc_af <- dat_wide %>%
inner_join(
dat_allele_freq %>%
filter(time_point == time_cmp) %>%
select(batch, DSB, allele, Allele_Frequency),
by = c("batch", "DSB", "allele")
) %>%
filter(!is.na(Allele_Frequency) & Allele_Frequency > 0)
cor_summary <- dat_fc_af %>%
group_by(batch, DSB) %>%
summarise(
cor_CIS_AF = ifelse(n() >= 2, cor(log2FC_CIS, Allele_Frequency), NA),
cor_TRANS_AF = ifelse(n() >= 2, cor(log2FC_TRANS, Allele_Frequency), NA),
n_obs = n(),
.groups = "drop"
)
knitr::kable(cor_summary, caption = "Correlation: log2FC vs Allele Frequency")
Correlation: log2FC vs Allele Frequency
| batch6 |
DSB1 |
0.3887672 |
NA |
25 |
| batch6 |
DSB2 |
0.2250024 |
NA |
25 |
| batch6 |
TRANS |
NA |
-0.0156710 |
25 |
| batch8 |
DSB1 |
0.6312123 |
NA |
25 |
| batch8 |
DSB2 |
-0.1127632 |
NA |
25 |
| batch8 |
TRANS |
NA |
0.0249546 |
25 |
plot_correlation_cis <- function(dat_fc_af, batch_name, dsb_name = dsb_for_cis_correlation) {
df_batch <- dat_fc_af %>% filter(batch == batch_name, DSB == dsb_name)
if (nrow(df_batch) < 2) return(NULL)
ggplot(df_batch, aes(x = Allele_Frequency, y = log2FC_CIS, label = allele)) +
geom_point(size = 3, alpha = 0.8, color = "darkgreen") +
geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE, color = "red") +
geom_text_repel(size = 3, max.overlaps = 20) +
theme_bw() +
labs(
title = paste0("Log2FC CIS vs Allele Freq | Batch: ", batch_name, " | ", dsb_name),
x = "Allele Frequency", y = "log2FC CIS"
)
}
plot_correlation_trans <- function(dat_fc_af, batch_name) {
df_batch <- dat_fc_af %>% filter(batch == batch_name)
if (nrow(df_batch) < 2) return(NULL)
ggplot(df_batch, aes(x = Allele_Frequency, y = log2FC_TRANS, color = DSB, label = allele)) +
geom_point(size = 3, alpha = 0.8) +
geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
geom_smooth(method = "lm", se = FALSE) +
geom_text_repel(size = 3, max.overlaps = 20) +
theme_bw() +
labs(
title = paste0("Log2FC TRANS vs Allele Freq | Batch: ", batch_name),
x = "Allele Frequency", y = "log2FC TRANS"
)
}
unique_batches <- unique(dat_fc_af$batch)
for (b in unique_batches) {
p_cis <- plot_correlation_cis(dat_fc_af, b)
p_trans <- plot_correlation_trans(dat_fc_af, b)
if (!is.null(p_cis)) print(p_cis)
if (!is.null(p_trans)) print(p_trans)
}



